#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@File    :   reduce.py
@Time    :   2022/10/13 23:01:33
@Author  :   Haowei Xu
@Site    :   https://howiehsu0126.github.io/
@License :   (C)Copyright 2018-2022, iOPEN, NWPU
@Desc    :   None
'''

import numpy as np
import pandas as pd
import os
import random
import torch

from pprint import pprint
from time import time
from sklearn.impute import SimpleImputer
from sklearn.covariance import EllipticEnvelope
from sklearn.impute import KNNImputer
from category_encoders import OneHotEncoder
from sklearn.preprocessing import StandardScaler


def get_base_model_list(models):
    """Pair each estimator with its class name, as ``[(name, model), ...]``."""
    named_models = []
    for estimator in models:
        named_models.append((type(estimator).__name__, estimator))
    return named_models


def seed_everything(seed, pytorch_init=True):
    """
    Seed every relevant RNG for reproducibility of results.

    Parameters
    ----------
    seed : int
        Seed applied to Python's ``random``, ``PYTHONHASHSEED``, NumPy
        and (optionally) PyTorch.
    pytorch_init : bool, default True
        When True, also seed PyTorch (CPU and all CUDA devices) and
        force deterministic cuDNN behaviour.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    if pytorch_init is True:
        torch.manual_seed(seed)
        # Seed every visible CUDA device, not just the current one.
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    # Fixed grammar in the user-facing message ("setted" -> "set").
    print(f'Seed for reproducibility of results has been set to: {seed}')


def reduce_mem_usage(df, verbose=True):
    """
    Downcast numeric columns of ``df`` (in place) to the smallest dtype
    that can hold their value range, reducing memory usage.

    NOTE: Please apply it after feature engineering or before major
    transformations that do not rescale your existing data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int/float columns are downcast in place.
    verbose : bool, default True
        When True, print the resulting memory usage and reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with numeric columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type).startswith('int'):
                # Inclusive bounds so columns hitting the exact dtype limits
                # (e.g. min == -128) are still downcast to the smaller type.
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            # NaN-containing float columns fail both comparisons and fall
            # through to float64, same as before.
            elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against division by zero for an empty/zero-memory frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction))
    return df


def data_cleaning(df, categorical_features, numerical_features, fill_list, drop_list, anomaly_list, target_col, mode='train'):
    """
    Clean and preprocess a copy of ``df``: drop columns, impute missing
    values, replace outliers, one-hot encode categoricals and standardize
    numerical columns.

    NOTE(review): every imputer/encoder/scaler is fit on the data passed
    in, so fitting happens independently on the train and test splits —
    confirm this leakage/consistency trade-off is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data (not modified; a deep copy is taken).
    categorical_features : list of str
        Columns to one-hot encode.
    numerical_features : list of str
        Columns to standardize.
    fill_list : list of str
        Columns whose missing values are filled with the column mode.
    drop_list : list of str
        Columns dropped up front.
    anomaly_list : list of str
        Columns screened for outliers (EllipticEnvelope) and re-imputed.
    target_col : str
        Name of the target column, popped out when ``mode == 'train'``.
    mode : str, default 'train'
        'train' returns (features, target); anything else returns
        (features, None).

    Returns
    -------
    (pandas.DataFrame, pandas.Series or None)
    """
    raw_df = df.copy(deep=True)
    raw_df = raw_df.drop(labels=drop_list, axis=1)

    # Fill categorical missing values with the column mode
    for col in fill_list:
        raw_df[col] = SimpleImputer(strategy='most_frequent').fit_transform(
            raw_df[col].values.reshape(-1, 1))

    # Detect outliers and impute them using KNNImputer
    for col in anomaly_list:
        values = raw_df[col].values.reshape(-1, 1)
        outliers_prediction = EllipticEnvelope().fit(values).predict(values)
        # Use positional indexing (.iloc): the original chained assignment
        # raw_df[col][positions] = np.nan mixed positional indices with
        # label-based lookup, which is wrong for non-RangeIndex frames and
        # triggers SettingWithCopyWarning (a silent no-op under pandas
        # copy-on-write).
        outlier_positions = np.flatnonzero(outliers_prediction == -1)
        raw_df.iloc[outlier_positions, raw_df.columns.get_loc(col)] = np.nan
        raw_df[col] = KNNImputer().fit_transform(
            raw_df[col].values.reshape(-1, 1))

    # Encode categorical values
    raw_df = OneHotEncoder(
        cols=categorical_features, use_cat_names=True).fit_transform(raw_df)  # One-Hot
    # Standardize numerical values
    raw_df[numerical_features] = pd.DataFrame(StandardScaler().fit_transform(
        raw_df[numerical_features]), columns=numerical_features)  # StandardScaler
    # Obtain target values (only for the training split)
    target = raw_df.pop(target_col) if mode == 'train' else None

    print(f'Data cleaning for {mode} set has been completed successfully.')
    return raw_df, target

# Reporting util for different optimizers
def report_perf(optimizer, X, y, title="model", callbacks=None):
    """
    Fit *optimizer* on (X, y), time the search and print a summary.

    optimizer = a sklearn or a skopt optimizer
    X = the training set
    y = our target
    title = a string label for the experiment

    Returns the best parameter set found by the optimizer.
    """
    tic = time()

    # Forward the callback(s) only when provided.
    fit_kwargs = {} if callbacks is None else {"callback": callbacks}
    optimizer.fit(X, y, **fit_kwargs)
    elapsed = time() - tic

    results = pd.DataFrame(optimizer.cv_results_)
    best_score = optimizer.best_score_
    best_score_std = results.iloc[optimizer.best_index_].std_test_score
    best_params = optimizer.best_params_
    n_candidates = len(optimizer.cv_results_['params'])

    print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
           + u"\u00B1" + " %.3f") % (elapsed, n_candidates, best_score, best_score_std))
    print('Best parameters:')
    pprint(best_params)
    print()
    return best_params
